(25 pts) Use Seurat and CellChat to analyze the single cell RNA-seq dataset of A549 lung carcinoma cells (the dataset is available on canvas/Files/Final/A549):

  1. Setup the Seurat object and perform quality control analysis. Please explain the reasoning of your thresholds for quality control. Use violin plots in your explanation.

  2. Normalize data, detect highly variable genes and scale the data.

library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## 'SeuratObject' was built under R 4.3.1 but the current version is
## 4.3.2; it is recomended that you reinstall 'SeuratObject' as the ABI
## for R may have changed
## 'SeuratObject' was built with package 'Matrix' 1.6.3 but the current
## version is 1.6.5; it is recomended that you reinstall 'SeuratObject' as
## the ABI for 'Matrix' may have changed
## 
## Attaching package: 'SeuratObject'
## The following object is masked from 'package:base':
## 
##     intersect
library(magrittr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
setwd("~/Desktop/big_data_week_1/A549")

# LOAD DATA AND MAKE SEURAT OBJECT


A549.data = readRDS("raw.data.rds")
A549 <- CreateSeuratObject(counts = A549.data, project = "A549", min.cells = 3, min.features = 200)
A549
## An object of class Seurat 
## 20210 features across 5867 samples within 1 assay 
## Active assay: RNA (20210 features, 0 variable features)
##  1 layer present: counts
head(colnames(A549))
## [1] "AAACCCAAGCTGAGCA-1" "AAACCCAAGCTGTCCG-1" "AAACCCACACAAATGA-1"
## [4] "AAACCCAGTATCTTCT-1" "AAACCCAGTCGATTCA-1" "AAACGAACAAATTGCC-1"
head(rownames(A549)) 
## [1] "AL627309.1" "AL627309.5" "LINC01409"  "LINC01128"  "LINC00115" 
## [6] "FAM41C"
head(A549@meta.data) 
##                    orig.ident nCount_RNA nFeature_RNA
## AAACCCAAGCTGAGCA-1       A549      11461         3434
## AAACCCAAGCTGTCCG-1       A549      12816         3529
## AAACCCACACAAATGA-1       A549       3947         1195
## AAACCCAGTATCTTCT-1       A549       2820         1157
## AAACCCAGTCGATTCA-1       A549      21631         4992
## AAACGAACAAATTGCC-1       A549        813          530
#VIOLIN PLOT BEFORE
VlnPlot(A549, features = c("nFeature_RNA", "nCount_RNA"), ncol = 3, pt.size = 0)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

plot1 <- FeatureScatter(A549, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")







#REASONING BEHIND THRESHOLDS - 
#n Feature shows number of detected genes for every cell
#n Count shows nymber of unique molecular identifiers for every cell
# The reasoning behind QC thresholds in remove noise from the data set in order to have a more accurate visualization of the RNA data we are analyzing. The also increase mapping quality for UMAP.
# For our QC we set our nFeature or number of genes for each cell from 50 to 6000. We also set our nCount or molecular identifiers for each cell from 0 to 25000

# QC
A549 <- subset(A549, subset = nFeature_RNA > 50 & nFeature_RNA < 6000 & nCount_RNA < 25000 & nCount_RNA > 0)
# NORMALIZING DATA
A549 <- NormalizeData(A549, normalization.method = "LogNormalize", scale.factor = 10000)
## Normalizing layer: counts
A549 <- NormalizeData(A549)
## Normalizing layer: counts
# After QC metrics you see more visible violin plot
VlnPlot(A549, features = c("nFeature_RNA", "nCount_RNA"), 
        ncol = 3,pt.size=0)

A549 <- FindVariableFeatures(A549, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(A549), 10)
top10
##  [1] "NTS"      "TAGLN"    "CEACAM6"  "NPTX1"    "IGFL1"    "GPX2"    
##  [7] "DHRS2"    "HIST1H4C" "MUC5B"    "IGFL2"
plot1 <- VariableFeaturePlot(A549)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot1  

plot2

all.genes <- rownames(A549)
#SCALE DATA
A549 <- ScaleData(A549, features = all.genes)
## Centering and scaling data matrix
  1. Perform PCA analysis and choose the number of PCs for further analysis. Explain your reasoning.
  2. Cluster the cells, and test different values of the resolution parameter in FindClusters(). Explain your reasoning for choosing the resolution value for further analysis.
  3. Visualize your clustering results in 2d via UMAP. Do this for at least two different values of resolution. How do results from FindClusters() change as you modify the resolution parameter?
A549 <- RunPCA(A549, features = VariableFeatures(object = A549))
## PC_ 1 
## Positive:  MALAT1, NEAT1, PAPPA, FN1, KCNQ1OT1, COL5A2, TGFBI, SAT1, COL4A1, NRP2 
##     COL5A1, LINC00472, DST, FOSB, MEG3, COL18A1, CFH, COL4A2, IGFL2-AS1, CNTN1 
##     THBS1, C3, LRP1, MEGF8, HSPG2, ITGA11, VCAN, ITGA2, NFKBIZ, COL7A1 
## Negative:  PFN1, PTMA, TUBA1B, CFL1, HMGB1, H2AFZ, RPS2, FTH1, RPL8, TUBB 
##     DYNLL1, RAN, GAPDH, NME2, RPS12, ACTB, RPL12, JPT1, RPS17, RPS6 
##     FTL, GSTP1, HSPE1, ACTG1, TXN, TMSB10, PRDX1, RPL26, CKS1B, RPL28 
## PC_ 2 
## Positive:  AGR2, GPX2, AKR1C2, AKR1B10, FGB, SLPI, ALDH3A1, MUC5B, AKR1C1, CPLX2 
##     FGA, AKR1C3, ELF3, CA12, S100P, PDLIM5, FGG, SLC9A3R2, NTS, SLC12A2 
##     MTUS1, MALL, CFD, SQSTM1, STEAP1, GCLC, FTL, AQP3, FGL1, TXNRD1 
## Negative:  PMEPA1, TGFBI, TPM1, TAGLN, IGFBP7, RDX, NRG1, CAV1, TGM2, NPC2 
##     COL5A1, GLIPR1, SOX4, COL4A2, CCDC80, TPM2, IL11, COL4A1, NPTX1, PDLIM7 
##     SERPINE1, RBP1, ZFP36L1, DUSP1, CD59, KRT7, AMIGO2, PGM2L1, VIM, TIMP2 
## PC_ 3 
## Positive:  FTH1, FTL, KRT8, GAPDH, AKR1C3, S100A6, NQO1, RPS12, GSTP1, TMSB10 
##     S100A4, S100A11, BLVRB, RPS6, S100A10, GPX2, ANXA4, KRT18, RPL8, RACK1 
##     AKR1C1, RPL26, TMSB4X, ALDH3A1, RPL12, TXN, RPS2, NME2, PKM, POLD4 
## Negative:  CENPF, TOP2A, MKI67, ASPM, PRC1, TPX2, HMMR, CENPE, PLK1, KIF23 
##     UBE2C, AURKB, CCNB1, SMC4, GTSE1, AURKA, KIF20B, UBE2S, DLGAP5, ANLN 
##     HMGB2, KIF14, KIF11, KPNA2, DEPDC1, CDC20, PRR11, CDCA2, PIF1, ARHGAP11A 
## PC_ 4 
## Positive:  RPS2, PTTG1, RPL8, RPL28, FABP5, RPL22L1, CCDC85B, COTL1, CDKN3, ACTG1 
##     RPL26, TUBA1B, RAN, RPL12, KRT81, FTH1, RPS12, TMSB4X, H2AFZ, NPM1 
##     PTMA, HSPE1, TNNT1, GAPDH, RPS6, IGFBP4, HMGB1, MRPL12, CKS1B, GPX4 
## Negative:  MALAT1, NEAT1, ANXA4, FN1, TRAM1, CNTN1, ATP1B1, MTUS1, SPTBN1, GPX2 
##     DST, SCD, DSP, PDLIM5, ELF3, CLDN2, SLC12A2, TM4SF20, KRT19, VCAN 
##     C3, AKR1C1, CFH, SYNE2, PAPPA, AHNAK2, ALDH1A1, PTBP3, EPS8, AGR2 
## PC_ 5 
## Positive:  ARL6IP1, CCNB1, CCNB2, PTTG1, KPNA2, PLK1, DLGAP5, HMMR, KIF20A, CDC20 
##     CENPE, AURKA, PRR11, TOP2A, CENPF, NEK2, CKS2, CENPA, DEPDC1, CDCA3 
##     NUSAP1, SOX4, CDKN3, TPX2, UBE2C, TUBA1C, BTG1, UBE2S, GTSE1, PRC1 
## Negative:  FAM111B, HIST1H1D, MCM10, HIST1H4C, CDC6, GINS2, CLSPN, MCM4, MCM3, E2F1 
##     BRCA1, ATAD2, MCM6, FEN1, MCM5, DTL, HELLS, MSH6, CDT1, UNG 
##     PCNA, CCNE1, HIST1H1E, CHAF1A, ORC6, MCM7, HIST1H1B, CDC45, MCM2, RFC2
print(A549[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  MALAT1, NEAT1, PAPPA, FN1, KCNQ1OT1 
## Negative:  PFN1, PTMA, TUBA1B, CFL1, HMGB1 
## PC_ 2 
## Positive:  AGR2, GPX2, AKR1C2, AKR1B10, FGB 
## Negative:  PMEPA1, TGFBI, TPM1, TAGLN, IGFBP7 
## PC_ 3 
## Positive:  FTH1, FTL, KRT8, GAPDH, AKR1C3 
## Negative:  CENPF, TOP2A, MKI67, ASPM, PRC1 
## PC_ 4 
## Positive:  RPS2, PTTG1, RPL8, RPL28, FABP5 
## Negative:  MALAT1, NEAT1, ANXA4, FN1, TRAM1 
## PC_ 5 
## Positive:  ARL6IP1, CCNB1, CCNB2, PTTG1, KPNA2 
## Negative:  FAM111B, HIST1H1D, MCM10, HIST1H4C, CDC6
VizDimLoadings(A549, dims = 1:2, reduction = "pca")

DimPlot(A549, reduction = "pca") + NoLegend()

DimHeatmap(A549, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(A549, dims = 1:10, cells = 500, balanced = TRUE)

ElbowPlot(A549) # Based off of the elbow plot the optimal number of PC's  is 10 due to the fact that variability drastically does not change after 10 PC's

# Cluster the cells

A549 <- FindNeighbors(A549, dims = 1:10)
## Computing nearest neighbor graph
## Computing SNN
# Resolution for cells is based on specificity of the visualization. A very high resolution will have many clusters while a very low resolution will have little clusters. 
A549 <- FindClusters(A549, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5755
## Number of edges: 184322
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8326
## Number of communities: 7
## Elapsed time: 0 seconds
head(Idents(A549), 10)
## AAACCCAAGCTGAGCA-1 AAACCCAAGCTGTCCG-1 AAACCCACACAAATGA-1 AAACCCAGTATCTTCT-1 
##                  1                  4                  0                  0 
## AAACCCAGTCGATTCA-1 AAACGAACAAATTGCC-1 AAACGAACAGGTTCGC-1 AAACGAAGTCTTGAAC-1 
##                  3                  6                  0                  2 
## AAACGAATCTCTGGTC-1 AAACGCTAGAGTGAAG-1 
##                  5                  1 
## Levels: 0 1 2 3 4 5 6
A549 <- RunUMAP(A549, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 23:18:39 UMAP embedding parameters a = 0.9922 b = 1.112
## 23:18:39 Read 5755 rows and found 10 numeric columns
## 23:18:39 Using Annoy for neighbor search, n_neighbors = 30
## 23:18:39 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:18:39 Writing NN index file to temp file /var/folders/8l/kydfwjds0pdf_9nnjjth36t80000gn/T//Rtmpw5bZKo/file5ece4ee22514
## 23:18:39 Searching Annoy index using 1 thread, search_k = 3000
## 23:18:40 Annoy recall = 100%
## 23:18:40 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:18:41 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:18:41 Commencing optimization for 500 epochs, with 227810 positive edges
## 23:18:45 Optimization finished
DimPlot(A549, reduction = "umap",label=TRUE)

# Visualize clustering results for resolution = 1.0
A549_1 <- FindClusters(A549, resolution = 2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5755
## Number of edges: 184322
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6962
## Number of communities: 20
## Elapsed time: 0 seconds
# Run UMAP
A549_1 <- RunUMAP(A549_1, dims = 1:10)
## 23:18:45 UMAP embedding parameters a = 0.9922 b = 1.112
## 23:18:45 Read 5755 rows and found 10 numeric columns
## 23:18:45 Using Annoy for neighbor search, n_neighbors = 30
## 23:18:45 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:18:46 Writing NN index file to temp file /var/folders/8l/kydfwjds0pdf_9nnjjth36t80000gn/T//Rtmpw5bZKo/file5ece60f06c81
## 23:18:46 Searching Annoy index using 1 thread, search_k = 3000
## 23:18:47 Annoy recall = 100%
## 23:18:47 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:18:47 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:18:47 Commencing optimization for 500 epochs, with 227810 positive edges
## 23:18:51 Optimization finished
# Plot UMAP for resolution = 1.0
DimPlot(A549_1, reduction = "umap", label = TRUE, pt.size = 1) 

# I am going to use resolution 0.6 because it is between the two extremes of resolution and it shows solid clustering 2.0 resolutions seems to cluster too much however I feel as if it is more accurate due to this being a larger data set.
A549 <- FindClusters(A549, resolution = 0.625)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 5755
## Number of edges: 184322
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8112
## Number of communities: 8
## Elapsed time: 0 seconds
head(Idents(A549), 5)
## AAACCCAAGCTGAGCA-1 AAACCCAAGCTGTCCG-1 AAACCCACACAAATGA-1 AAACCCAGTATCTTCT-1 
##                  5                  4                  0                  0 
## AAACCCAGTCGATTCA-1 
##                  1 
## Levels: 0 1 2 3 4 5 6 7
A549 <- RunUMAP(A549, dims = 1:10)
## 23:18:52 UMAP embedding parameters a = 0.9922 b = 1.112
## 23:18:52 Read 5755 rows and found 10 numeric columns
## 23:18:52 Using Annoy for neighbor search, n_neighbors = 30
## 23:18:52 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:18:53 Writing NN index file to temp file /var/folders/8l/kydfwjds0pdf_9nnjjth36t80000gn/T//Rtmpw5bZKo/file5ece4863badf
## 23:18:53 Searching Annoy index using 1 thread, search_k = 3000
## 23:18:54 Annoy recall = 100%
## 23:18:54 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:18:54 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:18:54 Commencing optimization for 500 epochs, with 227810 positive edges
## 23:18:58 Optimization finished
DimPlot(A549, reduction = "umap",label=TRUE)

  1. Find differentially expressed genes for each cluster and make a heat map showing the top 3 up-regulated genes per cluster.
  2. Assign cell type identity to clusters. Some canonical markers are listed below (see table on next page), but additional markers may be needed from the literature. Please use violin plot, feature plot and dot plot to show how you perform the cell type identification and confirm that annotated cell types express the expected genes.
# Finding markers of genes for cluster 0
cluster0.markers <- FindMarkers(A549, ident.1 = 0)
head(cluster0.markers, n = 10)
##         p_val avg_log2FC pct.1 pct.2 p_val_adj
## COA3        0  -2.356708 0.135 0.877         0
## PPDPF       0  -2.166215 0.179 0.903         0
## BANF1       0  -2.089261 0.295 0.966         0
## HSBP1       0  -1.910117 0.299 0.965         0
## NME4        0  -1.908669 0.307 0.956         0
## GABARAP     0  -1.900868 0.341 0.978         0
## POLR2I      0  -1.846981 0.331 0.965         0
## DBI         0  -1.854874 0.346 0.975         0
## NHP2        0  -1.838152 0.346 0.963         0
## ERH         0  -1.761156 0.359 0.974         0
# Finding markers of genes for cluster 1
cluster1.markers <- FindMarkers(A549, ident.1 = 1)
head(cluster1.markers, n = 10)
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## TPM1   1.209910e-288  1.5807827 0.997 0.915 2.445227e-284
## ACTG1  4.310457e-228  0.7235021 1.000 0.985 8.711434e-224
## TAGLN  1.074494e-213  1.9780730 0.732 0.241 2.171553e-209
## IL11   3.114080e-191  1.7907052 0.603 0.159 6.293557e-187
## PMEPA1 6.438115e-186  1.0123999 0.999 0.815 1.301143e-181
## ACTB   3.500561e-184  0.5655147 1.000 0.986 7.074634e-180
## KRT7   4.613388e-165  0.7310971 1.000 0.971 9.323656e-161
## CFL1   9.949463e-164  0.6059308 1.000 0.920 2.010786e-159
## RPL28  1.830770e-156  0.5523915 1.000 0.986 3.699986e-152
## TAGLN2 1.753102e-152  0.7986930 0.995 0.828 3.543020e-148
# Finding markers of genes for cluster 2
cluster2.markers <- FindMarkers(A549, ident.1 = 2)
head(cluster2.markers, n = 10)
##                  p_val avg_log2FC pct.1 pct.2     p_val_adj
## HIST1H4C 4.195899e-221  1.6725503 0.996 0.721 8.479912e-217
## HIST1H1D 3.040653e-169  1.9229910 0.680 0.243 6.145159e-165
## GINS2    2.193770e-150  1.3605728 0.818 0.372 4.433610e-146
## CDC6     3.478890e-145  1.5229886 0.647 0.226 7.030836e-141
## HIST1H1B 9.557096e-137  2.6163462 0.315 0.053 1.931489e-132
## FTL      3.787065e-131  0.5939320 1.000 1.000 7.653659e-127
## PCNA     7.190333e-122  0.8839408 0.977 0.716 1.453166e-117
## MCM10    6.486810e-121  1.4250327 0.604 0.222 1.310984e-116
## HIST1H1E 4.734428e-117  1.7092959 0.483 0.152 9.568279e-113
## TXN      1.618545e-109  0.5409198 1.000 0.931 3.271080e-105
# Finding markers of genes for cluster 3
cluster3.markers <- FindMarkers(A549, ident.1 = 3)
head(cluster3.markers, n = 10)
##                 p_val avg_log2FC pct.1 pct.2     p_val_adj
## TMSB4X  6.355090e-179  0.6938295 1.000 0.983 1.284364e-174
## RPS19   6.669485e-160  0.4621074 1.000 1.000 1.347903e-155
## EEF1A1  1.176821e-144  0.5197219 1.000 0.980 2.378355e-140
## RPL10   1.343930e-141  0.4807661 1.000 0.993 2.716082e-137
## S100A11 5.182629e-134  0.6733566 1.000 0.924 1.047409e-129
## MKI67   2.781633e-133 -2.8205779 0.215 0.647 5.621680e-129
## RDX     4.463185e-129  0.8911422 0.993 0.906 9.020096e-125
## TOP2A   3.963670e-128 -1.9893683 0.493 0.786 8.010578e-124
## CENPF   8.712957e-122 -2.2650856 0.385 0.727 1.760889e-117
## RPL28   1.584062e-121  0.5567455 1.000 0.987 3.201389e-117
#Findng markers for gene for cluster 4
cluster4.markers <- FindMarkers(A549, ident.1 = 4)
head(cluster4.markers, n = 10)
##                   p_val avg_log2FC pct.1 pct.2     p_val_adj
## NTS        0.000000e+00  3.3221576 0.991 0.489  0.000000e+00
## LINC02762 6.873426e-291  3.7379763 0.438 0.033 1.389119e-286
## ALX1      1.122712e-234  3.0632122 0.498 0.070 2.269000e-230
## CPS1      5.494507e-232  2.3973933 0.604 0.111 1.110440e-227
## SUSD2     1.512334e-167  2.2627432 0.495 0.100 3.056427e-163
## PRELID1   2.159274e-142  0.8202541 1.000 0.868 4.363893e-138
## PKIB      4.609794e-140  2.8635654 0.306 0.040 9.316394e-136
## ANXA1     3.657930e-136  0.8111787 1.000 0.963 7.392676e-132
## MUC5B     1.838484e-129  1.7421409 0.486 0.121 3.715575e-125
## RACK1     1.389478e-127  0.5857516 1.000 0.954 2.808136e-123
# Finding markers of genes for cluster 5
cluster5.markers <- FindMarkers(A549, ident.1 = 5, ident.2 = c(0, 3))
head(cluster5.markers, n = 10)
##                 p_val avg_log2FC pct.1 pct.2     p_val_adj
## FTL     1.742295e-222  1.4032189 1.000 1.000 3.521177e-218
## FTH1    9.736933e-189  1.2050120 1.000 0.998 1.967834e-184
## TXN     6.786307e-180  1.2450262 0.998 0.847 1.371513e-175
## GPX2    6.630520e-172  2.4711104 0.920 0.394 1.340028e-167
## AKR1B10 7.738433e-157  1.5591205 0.995 0.840 1.563937e-152
## NQO1    9.844182e-153  1.0717858 0.998 0.884 1.989509e-148
## PRDX1   4.232110e-147  0.9553572 1.000 0.897 8.553094e-143
## MT-CYB  2.927467e-136 -1.5711710 0.995 0.998 5.916412e-132
## ALDH3A1 1.500572e-134  1.7276622 0.997 0.722 3.032656e-130
## AKR1C2  2.501677e-134  1.2610034 0.993 0.779 5.055889e-130
# Finding markers of genes for cluster 6
cluster6.markers <- FindMarkers(A549, ident.1 = 6)
head(cluster6.markers, n = 10)
##               p_val avg_log2FC pct.1 pct.2     p_val_adj
## CKS2  4.423762e-187   1.591559 0.996 0.692 8.940424e-183
## AURKA 2.862877e-180   1.912712 0.911 0.361 5.785875e-176
## UBE2S 2.875244e-175   1.479094 0.998 0.767 5.810867e-171
## PLK1  1.027224e-172   1.890038 0.872 0.296 2.076019e-168
## CCNB1 6.963263e-168   1.581121 0.996 0.575 1.407275e-163
## HMMR  3.787571e-162   1.766277 0.899 0.339 7.654681e-158
## TPX2  9.491534e-154   1.638651 0.955 0.492 1.918239e-149
## UBE2C 4.837651e-151   1.713232 0.923 0.444 9.776893e-147
## KPNA2 1.340274e-149   1.291719 0.990 0.778 2.708693e-145
## TOP2A 2.099512e-146   1.323991 1.000 0.718 4.243114e-142
# Finding markers of genes for cluster 7
cluster7.markers <- FindMarkers(A549, ident.1 = 7)
head(cluster7.markers, n = 10)
##                  p_val avg_log2FC pct.1 pct.2    p_val_adj
## NEAT1     1.635335e-32  -3.410850 0.134 0.983 3.305013e-28
## MT-CO3    1.444135e-30  -2.082792 0.642 1.000 2.918597e-26
## MT-ND4    3.768157e-27  -1.992682 0.403 0.999 7.615446e-23
## MT-ND2    4.156535e-27  -2.009570 0.194 0.998 8.400357e-23
## MT-CO1    5.059390e-27  -1.989446 0.612 0.999 1.022503e-22
## MT-ND1    8.612020e-27  -1.953392 0.299 0.999 1.740489e-22
## MT-CO2    2.181771e-26  -1.822864 0.612 0.999 4.409359e-22
## MT-ND6    3.529342e-26  -2.481670 0.090 0.886 7.132800e-22
## MT-ND3    4.527261e-26  -1.882049 0.328 0.998 9.149594e-22
## MTRNR2L12 1.214884e-25  -2.108997 0.090 0.885 2.455281e-21
# Finding markers of genes for cluster 8


#finds marker for every cluster
A549.markers <- FindAllMarkers(A549, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
A549.markers %>%
    group_by(cluster) %>%
    dplyr::filter(avg_log2FC > 1)
## # A tibble: 2,557 × 7
## # Groups:   cluster [8]
##    p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene     
##    <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>    
##  1     0       2.89 0.963 0.852         0 0       MTRNR2L12
##  2     0       2.25 0.934 0.862         0 0       MT-ND6   
##  3     0       1.94 0.998 0.984         0 0       MT-ND5   
##  4     0       1.91 0.998 0.986         0 0       MT-ND2   
##  5     0       2.24 0.999 0.988         0 0       MT-ND1   
##  6     0       2.07 0.999 0.988         0 0       MT-ND3   
##  7     0       1.96 0.999 0.99          0 0       MT-ND4   
##  8     0       2.12 0.998 0.99          0 0       MT-CYB   
##  9     0       2.22 1     0.994         0 0       MT-CO3   
## 10     0       2.09 0.998 0.993         0 0       MALAT1   
## # ℹ 2,547 more rows
# Extract top 3 markers per cluster
top_markers_per_cluster <- A549.markers %>%
    group_by(cluster) %>%
    top_n(5, avg_log2FC) %>%
    ungroup()

# Create a heatmap
DoHeatmap(A549, features = top_markers_per_cluster$gene) + NoLegend()

# Transpose the expression matrix




# Violin Plot to visualize marker expression 

VlnPlot(A549, features = c( "TBX6", "FOXA2"))

VlnPlot(A549, features = c("SOX9"   ,  "MUC5AC"))

VlnPlot(A549, features = c("MUC13" ,  "MUC1"))

VlnPlot(A549, features = c("CFTR" ,  "CD44"))

# early mesoderm TBX6, endoderm FOXA2, lung proginator cells SOX9, cilated cells MUC5AC, goblet cells MUC13, clara cells MUC1, alveolar type CFTR, cells cancer stem cells CD44 
FeaturePlot(A549, features = c("TBX6", "FOXA2", "SOX9","MUC5AC", "MUC13", "MUC1" , "CFTR" , "CD44"))

DotPlot(A549, features = c("TBX6", "FOXA2", "SOX9","MUC5AC", "MUC13", "MUC1" , "CFTR" , "CD44"))

# TBX6 1.3, 2.3    -----> Cluster 3
# FOXA2 1. ,2.6 checked ----> Cluster 6
# SOX9 1.1, 2.1 - ------> Cluster 1 
#MUC5AC 1.7, 2.7 ** ----> cluster 7
#MUC13, 1. 2.4 -----> cluster 4
#MUC1 , 2.5 -----> cluster 5 
#CFTR 2.0, 0 ---> Cluster 2
#CD44 2.2 -----> CLuster 0

new.cluster.ids <- c("cancer stem cells", "lung proginator", "alveolar cell", "early mesoderm", "goblet cells", "clara cells",
    "endoderm", "cilated cells ")
names(new.cluster.ids) <- levels(A549)
A549 <- RenameIdents(A549, new.cluster.ids)
DimPlot(A549, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()

Create the CellChat object. i) Set up the ligand-receptor interaction database, identify the over-expressed ligands and receptors and compute the communication probability at the signaling pathway level.

library(CellChat)
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following object is masked from 'package:Seurat':
## 
##     components
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
## Loading required package: ggplot2
# Make CellChat object
cellchat <- createCellChat(object = A549, group.by = "ident", assay = "RNA")
## [1] "Create a CellChat object from a Seurat object"
## The `meta.data` slot in the Seurat object is used as cell meta information
## Warning in createCellChat(object = A549, group.by = "ident", assay = "RNA"):
## The 'meta' data does not have a column named `samples`. We now add this column
## and all cells are assumed to belong to `sample1`!
## Set cell identities for the new CellChat object 
## The cell groups used for CellChat analysis are  cancer stem cells, lung proginator, alveolar cell, early mesoderm, goblet cells, clara cells, endoderm, cilated cells
# Set up database
CellChatDB <- CellChatDB.human  # or use CellChatDB.mouse if your data is mouse
showDatabaseCategory(CellChatDB)

CellChatDB.use <- subsetDB(CellChatDB, search = "Secreted Signaling", key = "annotation")  # Use Secreted Signaling only
cellchat@DB <- CellChatDB.use

# Subset data (necessary)
cellchat <- subsetData(cellchat)  # This step is necessary even if using the whole database

# Preprocessing
cellchat <- identifyOverExpressedGenes(cellchat)
cellchat <- identifyOverExpressedInteractions(cellchat)
## The number of highly variable ligand-receptor pairs used for signaling inference is 447
# Infer communication probability
cellchat <- computeCommunProb(cellchat, type = "triMean")
## triMean is used for calculating the average gene expression per cell group. 
## [1] ">>> Run CellChat on sc/snRNA-seq data <<< [2024-03-18 23:19:28.741889]"
## [1] ">>> CellChat inference is done. Parameter values are stored in `object@options$parameter` <<< [2024-03-18 23:20:01.961318]"
cellchat <- filterCommunication(cellchat, min.cells = 10)

# Compute communication probability at the pathway level
cellchat <- computeCommunProbPathway(cellchat)
  1. Visualize the aggregated cell-cell communication network by showing both the number of interactions and the total interaction strength (weights) between any two cell groups using circle plot. Describe possible biological findings you obtain.
  2. Visualize one particular signaling pathway of your choice using Hierarchy plot, Circle plot and Chord diagram. Describe your findings.
# Aggregate pathways
cellchat <- aggregateNet(cellchat)
# Visualize pathways (aggregate)
groupSize <- as.numeric(table(cellchat@idents))
par(mfrow = c(1,2), xpd=TRUE)

# Circle plot for number fo interactuions and interaction weights/strengths
netVisual_circle(cellchat@net$count, vertex.weight = groupSize, weight.scale = T, label.edge= F, title.name = "Number of interactions")
netVisual_circle(cellchat@net$weight, vertex.weight = groupSize, weight.scale = T, label.edge= F, title.name = "Interaction weights/strength")

#Circle

netVisual_aggregate(cellchat, signaling = "WNT", layout = "circle")

#In the Wnt pathway there are a lot of connections from mesoderm to endoderm This makes sense because the WNT signalling pathway is responsible for development mesoderm and endoderm in embryo. Aveolar and lung proginator cells also link with with the endoderm this is because the endoderm in development leads to layer of tissue that make up the aveolar and lung proginator cells . Alveolar and lung proginator cells have very strong connections with goblet and clara cells through this pathway. Cancer Stem cells and cilated cells have weak connections to everyhting throgh the Wnt pathway.

# Chord
par(mfrow=c(1,1))

netVisual_aggregate(cellchat, signaling = "WNT", layout = "chord")

# Clara Cells seem to have a strong connection with lung proginator cells. Cilated cells seem to have no connection to anything through the Wnt pathway. Everyone of the cells except cilated cells seem to be connected one way or another.

#HEiarchial

#Goblet cells seem to be conncecting to everything except clara cells. Cilated cells has no connection with anything.

vertex.receiver = seq(1,4) # a numeric vector. 
netVisual_aggregate(cellchat, signaling = "WNT",  vertex.receiver = vertex.receiver)

  1. Compute the contribution of each ligand-receptor pair to the chosen signaling pathway from k) and visualize the cell-cell communication mediated by one single ligand-receptor pair using Hierarchy plot, Circle plot or Chord diagram.
  2. Identify signaling roles and visualize the computed centrality scores using heatmap. Explain what each centrality score means.
  3. Visualize the dominant senders (sources) and receivers (targets) in a 2D space and describe your findings. Suggest some possible biological interpretations
pairLR.wnt <- extractEnrichedLR(cellchat, signaling = "WNT", geneLR.return = FALSE)
LR.show <- pairLR.wnt[1,] # show one ligand-receptor pair
# Hierarchy plot
vertex.receiver = seq(1,4) # a numeric vector
netVisual_individual(cellchat, signaling = "WNT",  pairLR.use = LR.show, vertex.receiver = vertex.receiver)
## [[1]]
# Circle plot
netVisual_individual(cellchat, signaling = "WNT", pairLR.use = LR.show, layout = "circle")

## [[1]]
# Chord diagram
netVisual_individual(cellchat, signaling = "WNT", pairLR.use = LR.show, layout = "chord")

## [[1]]
# HEAT MAP
# Compute the network centrality scores
cellchat <- netAnalysis_computeCentrality(cellchat, slot.name = "netP") # the slot 'netP' means the inferred intercellular communication network of signaling pathways


# Visualize the computed centrality scores using heatmap, allowing ready identification of major signaling roles of cell groups
netAnalysis_signalingRole_network(cellchat, signaling = "WNT", width = 8, height = 2.5, font.size = 10)

#The centrality scores show the relatedness or importance  of signalalling roles  being recieved, sent, mediated, or influenced relation to each other. For this specific example the centrality scores describe WNT signalling between cells.



#DOMINANT SENDERS - Lung proginator and Endoderm. THis makes sense because lung proginator is used for lung regeneration and repair therefore it would need to be strong in sending Wnt signals because WNt signal pathway. is responsible for development of embryos. Endoderm is very associated with layer of tissue during development as well.

#DOMINANT RECIEVERS  - Endoderm. THis makes sense because the WNT is used for development and the endoderm is layers of tissue in development of embryo

(20 pts) The Khan dataset in ISLR library (library(ISLR)) consists of a number of tissue samples corresponding to four distinct types of small round blue cell tumors. For each tissue sample, gene expression measurements are available. The data set consists of training data, xtrain and ytrain, and testing data, xtest and ytest.

  1. Use random forests to analyze this data (need to convert ytrain/ytest into a qualitative response variable). What is the value for mtry in the randomForest() function and why?

  2. Show the confusion matrix and overall fraction of correct predictions in the testing data.

  3. Use the importance() function to determine the top 10 genes that are most important for the classification. What are the top 10 genes?

library(e1071)
## Registered S3 methods overwritten by 'proxy':
##   method               from    
##   print.registry_field registry
##   print.registry_entry registry
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:CellChat':
## 
##     dotPlot
library(ISLR)
library(dplyr)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
data(Khan)

# Convert response variables to factors
xtrain = Khan$xtrain
ytrain <- as.factor(Khan$ytrain)
ytest <- as.factor(Khan$ytest)

# Train the random forest model
rf_model <- randomForest(x = Khan$xtrain, y = ytrain)

# Print the model summary
print(rf_model)
## 
## Call:
##  randomForest(x = Khan$xtrain, y = ytrain) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 48
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##   1  2  3  4 class.error
## 1 8  0  0  0           0
## 2 0 23  0  0           0
## 3 0  0 12  0           0
## 4 0  0  0 20           0
#mtry is the number of variables sampled as candidates at each split. The value for mtry is 48. mtry is responsible for helping the model not be too biased or too variant. mtry is responsible for making the model accurate


pred_test <- predict(rf_model, newdata = Khan$xtest)

# Show confusion matrix
conf_matrix <- table(Actual = ytest, Predicted = pred_test)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(conf_matrix)
##       Predicted
## Actual 1 2 3 4
##      1 3 0 0 0
##      2 0 6 0 0
##      3 0 1 4 1
##      4 0 0 0 5
# Calculate overall fraction of correct predictions
accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
print("Overall Fraction of Correct Predictions:")
## [1] "Overall Fraction of Correct Predictions:"
print(accuracy)
## [1] 0.9
importance_scores <- importance(rf_model)

# Extract the MeanDecreaseGini scores (or any other relevant measure)
mean_decrease_gini <-  importance_scores[, "MeanDecreaseGini"]

# Sort the scores in descending order to identify the top 10 genes
top_10_genes <- names(sort(mean_decrease_gini, decreasing = TRUE)[1:10])

# Print the names of the top 10 genes
print("Top 10 Genes:")
## [1] "Top 10 Genes:"
print(top_10_genes)
##  [1] "1954" "246"  "2050" "1003" "545"  "1389" "1955" "509"  "1645" "1319"
  1. Try different values of nodesize, ntree, and maxnodes in the randomForest() function, perform 5-fold cross-validation to find the best model among them. Show the confusion matrix of the best model and report the parameters.
  2. Use a support vector approach (svm() function) to predict cancer subtype using gene expression measurements. Choose an appropriate kernel and explain the reason.
dim(Khan$xtrain)
## [1]   63 2308
dim(ytrain)
## NULL
parameter_grid <- expand.grid(
  mtry = c(2, 4, 6), 
  nodesize = c(1, 5, 10),
  ntree = c(100, 200, 300),
  maxnodes = c(5, 10, 15)
)

# Initialize variables to store the best model and its performance
best_model <- NULL
best_accuracy <- 0

# Perform cross-validation for each parameter combination
for (i in 1:nrow(parameter_grid)) {
  # Train the random forest model with current parameters
  rf_model <- randomForest(x = Khan$xtrain, 
                           y = ytrain,
                           mtry = parameter_grid$mtry[i],
                           nodesize = parameter_grid$nodesize[i],
                           ntree = parameter_grid$ntree[i],
                           maxnodes = parameter_grid$maxnodes[i])
  
  # Make predictions on the testing data
  pred_test <- predict(rf_model, newdata = Khan$xtest)
  
  # Calculate accuracy
  accuracy <- mean(pred_test == ytest)
  
  # Check if this model has higher accuracy than the best model so far
  if (accuracy > best_accuracy) {
    best_accuracy <- accuracy
    best_model <- rf_model
  }
}

# Show the best model and its accuracy
print("Best Model:")
## [1] "Best Model:"
print(best_model)
## 
## Call:
##  randomForest(x = Khan$xtrain, y = ytrain, ntree = parameter_grid$ntree[i],      mtry = parameter_grid$mtry[i], nodesize = parameter_grid$nodesize[i],      maxnodes = parameter_grid$maxnodes[i]) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 6.35%
## Confusion matrix:
##   1  2  3  4 class.error
## 1 6  1  1  0  0.25000000
## 2 0 21  0  2  0.08695652
## 3 0  0 12  0  0.00000000
## 4 0  0  0 20  0.00000000
print("Best Model Accuracy:")
## [1] "Best Model Accuracy:"
print(best_accuracy)
## [1] 0.85
# Create a trainControl object for 5-fold cross-validation
ctrl <- trainControl(method = "cv", 
                     number = 5,
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE,
                     verboseIter = TRUE)






svm_model <- svm(ytrain ~ ., data = Khan$xtrain, kernel = "polynomial", degree = 3, coef0 = 1)

svm_model
## 
## Call:
## svm(formula = ytrain ~ ., data = Khan$xtrain, kernel = "polynomial", 
##     degree = 3, coef0 = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  3 
##      coef.0:  1 
## 
## Number of Support Vectors:  63
  1. What is the overall fraction of correct predictions in the testing data?
  2. Try different values of cost, kernel in the svm() function, perform 5-fold cross- validation to find the best model among them. Show the confusion matrix of the best model and report the parameters.
library(e1071)
# Make predictions on the testing dataset
pred_test <- predict(rf_model, newdata = Khan$xtest)

# Compare predictions to true labels
correct_predictions <- sum(pred_test == ytest)

# Calculate total number of predictions
total_predictions <- length(ytest)

# Calculate accuracy
accuracy <- correct_predictions / total_predictions

# Print the overall fraction of correct predictions
print("Overall Fraction of Correct Predictions:")
## [1] "Overall Fraction of Correct Predictions:"
print(accuracy)
## [1] 0.8
parameter_grid <- expand.grid(cost = c(0.1, 1, 10),
                               kernel = c("linear", "radial", "polynomial"))

# Create a trainControl object for 5-fold cross-validation
ctrl <- trainControl(method = "cv", number = 5)

# Initialize variables to store the best model and its performance
best_model <- NULL
best_accuracy <- 0

# Perform cross-validation for each parameter combination
for (i in 1:nrow(parameter_grid)) {
  # Train the SVM model with current parameters
  svm_model <- svm(ytrain ~ ., 
                   data = Khan$xtrain,
                   kernel = parameter_grid$kernel[i],
                   cost = parameter_grid$cost[i])
  
  # Make predictions on the testing data
  pred_test <- predict(svm_model, newdata = Khan$xtest)
  
  # Calculate accuracy
  accuracy <- mean(pred_test == ytest)
  
  # Check if this model has higher accuracy than the best model so far
  if (accuracy > best_accuracy) {
    best_accuracy <- accuracy
    best_model <- svm_model
  }
}

# Show the best model and its accuracy
print("Best SVM Model:")
## [1] "Best SVM Model:"
print(best_model)
## 
## Call:
## svm(formula = ytrain ~ ., data = Khan$xtrain, kernel = parameter_grid$kernel[i], 
##     cost = parameter_grid$cost[i])
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.1 
## 
## Number of Support Vectors:  58
print("Best Model Accuracy:")
## [1] "Best Model Accuracy:"
print(best_accuracy)
## [1] 0.9
# Make predictions on the test set
pred_test <- predict(best_model, newdata = Khan$xtest)

# Show confusion matrix
conf_matrix <- table(Actual = ytest, Predicted = pred_test)
print("Confusion Matrix:")
## [1] "Confusion Matrix:"
print(conf_matrix)
##       Predicted
## Actual 1 2 3 4
##      1 3 0 0 0
##      2 0 6 0 0
##      3 0 2 4 0
##      4 0 0 0 5
# Report the parameters of the best model
print("Best Model Parameters:")
## [1] "Best Model Parameters:"
print(best_model$cost)
## [1] 0.1
print(best_model$kernel)
## [1] 0